# Relative path to the project root (this notebook lives one level below it).
base_dir = '../'
import numpy as np
import pandas as pd
import shap as shap
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from uszipcode import SearchEngine
from pandas_profiling import ProfileReport
# settings to display all columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# Raw modeling data: one row per (respondent, alternative) pair — four
# consecutive rows per choice situation (see the `alternative` column).
data = pd.read_csv(base_dir + 'data/modelingData_SpaceTourism_modified.csv')
# ensure zipcodes are plain ints for the uszipcode lookups performed later
data['zip'] = data['zip'].astype(int)
data.head()
| id | choice | zip | year_birth | gender | annual_income | household_annual_income | number_vehicles | level_education | work_type | children_home | household_type | status_in_household | type_residence | housing_tenure_type | origin | race | citizenship | risk_activities_sports | price_attribute | availability | probability_fatality | training | number_passengers | takeoff_location | price_dollars | alternative | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 55906 | 1976 | 1 | 8 | 9 | 1 | 7 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 1 | 1 | 2 | 0 | 1 | 0 | 0 | 0 | 453125.0 | suborbital |
| 1 | 1 | 0 | 55906 | 1976 | 1 | 8 | 9 | 1 | 7 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 1 | 1 | 2 | 0 | 0 | 1 | 0 | 0 | 453125.0 | orbital |
| 2 | 1 | 1 | 55906 | 1976 | 1 | 8 | 9 | 1 | 7 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 3750.0 | moon_trip |
| 3 | 1 | 0 | 55906 | 1976 | 1 | 8 | 9 | 1 | 7 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | 0.0 | not_travel |
| 4 | 2 | 1 | 55906 | 1976 | 1 | 8 | 9 | 1 | 7 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 1 | 1 | 2 | 0 | 0 | 1 | 0 | 0 | 453125.0 | suborbital |
# Inspect dtypes and non-null counts of the raw data.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8640 entries, 0 to 8639 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 8640 non-null int64 1 choice 8640 non-null int64 2 zip 8640 non-null int64 3 year_birth 8640 non-null int64 4 gender 8640 non-null int64 5 annual_income 8640 non-null int64 6 household_annual_income 8640 non-null int64 7 number_vehicles 8640 non-null int64 8 level_education 8640 non-null int64 9 work_type 8640 non-null int64 10 children_home 8640 non-null int64 11 household_type 8640 non-null int64 12 status_in_household 8640 non-null int64 13 type_residence 8640 non-null int64 14 housing_tenure_type 8640 non-null int64 15 origin 8640 non-null int64 16 race 8640 non-null int64 17 citizenship 8640 non-null int64 18 risk_activities_sports 8640 non-null int64 19 price_attribute 8640 non-null int64 20 availability 8640 non-null int64 21 probability_fatality 8640 non-null int64 22 training 8640 non-null int64 23 number_passengers 8640 non-null int64 24 takeoff_location 8640 non-null int64 25 price_dollars 8640 non-null float64 26 alternative 8640 non-null object dtypes: float64(1), int64(25), object(1) memory usage: 1.8+ MB
# Mapping from the survey's integer codes to readable category labels.
# Ordinal variables carry an 'a_'/'b_'/... prefix so that lexicographic order
# matches the ordinal order. For the alternative-specific attributes
# (price_attribute .. takeoff_location), -1 marks "not applicable" (the
# not_travel rows) and is mapped to NaN.
cleanup_nums = {
    'gender': {
        1: 'male',
        2: 'female'
    },
    #! ADD ORDINAL CATEGORIES
    'annual_income': {
        1: 'a_less_than_10k',
        2: 'b_10k_15k',
        3: 'c_15k_25k',
        4: 'd_25k_35k',
        5: 'e_35k_50k',
        6: 'f_50k_75k',
        7: 'g_75k_100k',
        8: 'h_100k_150k',
        9: 'i_150k_200k',
        10: 'j_more_than_200k'
    },
    #! ADD ORDINAL CATEGORIES
    'household_annual_income': {
        1: 'a_less_than_10k',
        2: 'b_10k_15k',
        3: 'c_15k_25k',
        4: 'd_25k_35k',
        5: 'e_35k_50k',
        6: 'f_50k_75k',
        7: 'g_75k_100k',
        8: 'h_100k_150k',
        9: 'i_150k_200k',
        10: 'j_more_than_200k'
    },
    #! ADD ORDINAL CATEGORIES
    'number_vehicles': {
        1: '1_car',
        2: '2_cars',
        3: '3_cars',
        4: '4_or_more_cars'
    },
    #! ADD ORDINAL CATEGORIES
    'level_education': {
        1: 'less_9th_grade',
        2: '9th_12th_grade_nodiploma',
        3: 'high_school_graduate',
        4: 'some_college',
        5: 'associate_degree',
        6: 'bachelor_degree',
        7: 'grad_prof_degree'
    },
    'work_type': {
        1: 'private',
        2: 'government',
        3: 'self_employed',
        4: 'unpaid_work'
    },
    #! ADD ORDINAL CATEGORIES
    'children_home': {
        1: '0_children',
        2: '1_child',
        3: '2_children',
        4: '3_children',
        5: '4_children',
        6: '5_children_or_more'
    },
    # leading 0/1 encodes "no children in household" / "children in household";
    # this prefix is relied on by the consistency filtering further below
    'household_type': {
        1: '1_couple_with_children',
        2: '0_couple_no_children',
        3: '1_male_children',
        4: '0_male_no_children',
        5: '1_female_children',
        6: '0_female_no_children',
        7: '0_alone',
        8: '2_other'
    },
    'status_in_household': {
        1: 'head',
        2: 'spouse',
        3: 'child',
        4: 'other'
    },
    'type_residence': {
        1: 'house',
        2: 'apartment',
        3: 'other'
    },
    'housing_tenure_type': {
        1: 'own',
        2: 'rent'
    },
    'origin': {
        1: 'hispanic',
        2: 'non_hispanic'
    },
    'race': {
        1: 'white',
        2: 'black',
        3: 'asian',
        4: 'hawaian_pacific',
        5: 'other_race',
        6: 'two_or_more_races'
    },
    'citizenship': {
        1: 'us_citizen',
        2: 'other'
    },
    #! ADD ORDINAL CATEGORIES
    'risk_activities_sports': {
        1: 'never',
        2: 'rarely',
        3: 'often'
    },
    'price_attribute': {
        0: '3_perc_annual_income',
        1: '50_perc_annual_income',
        2: '362_perc_annual_income',
        -1: np.nan
    },
    'availability': {
        0: 'immediate',
        1: 'in_5_years',
        -1: np.nan
    },
    # numeric recode — presumably fatality probability in percent; confirm
    # against the survey codebook
    'probability_fatality': {
        0: 0.5,
        1: 7.5,
        -1: np.nan
    },
    'training': {
        0: 'no',
        1: 'yes',
        -1: np.nan
    },
    'number_passengers': {
        0: 'one',
        1: 'more_than_one',
        -1: np.nan
    },
    'takeoff_location': {
        0: 'usa',
        1: 'other',
        -1: np.nan
    }
}
# Apply all the per-column recodings at once.
data = data.replace(cleanup_nums)
data.head()
| id | choice | zip | year_birth | gender | annual_income | household_annual_income | number_vehicles | level_education | work_type | children_home | household_type | status_in_household | type_residence | housing_tenure_type | origin | race | citizenship | risk_activities_sports | price_attribute | availability | probability_fatality | training | number_passengers | takeoff_location | price_dollars | alternative | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 55906 | 1976 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | 7.5 | no | one | usa | 453125.0 | suborbital |
| 1 | 1 | 0 | 55906 | 1976 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | 0.5 | yes | one | usa | 453125.0 | orbital |
| 2 | 1 | 1 | 55906 | 1976 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 3_perc_annual_income | immediate | 0.5 | no | more_than_one | usa | 3750.0 | moon_trip |
| 3 | 1 | 0 | 55906 | 1976 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | not_travel |
| 4 | 2 | 1 | 55906 | 1976 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | 0.5 | yes | one | usa | 453125.0 | suborbital |
# Creating a column with 4 categories for each alternative: rows come in
# repeating groups of four (suborbital, orbital, moon_trip, not_travel).
list_alt = [1, 2, 3, 4]
data['list_alt'] = np.tile(list_alt, len(data) // len(list_alt))

# Attributes that describe a single alternative and therefore get an
# alternative-specific suffix in the wide layout.
_ALT_ATTRIBUTES = [
    'price_attribute', 'availability', 'probability_fatality', 'training',
    'number_passengers', 'takeoff_location', 'price_dollars'
]


def _alternative_frame(code, name):
    """Rows of one alternative with `choice` and its attributes suffixed by `name`."""
    renames = {'choice': name}
    renames.update({col: f'{col}_{name}' for col in _ALT_ATTRIBUTES})
    return data[data['list_alt'] == code].rename(columns=renames)


def _alternative_columns(name):
    """Column subset to merge in for one alternative (id + suffixed attributes)."""
    return ['id', name] + [f'{col}_{name}' for col in _ALT_ATTRIBUTES]


# Creating four dataframes with the different alternatives
data_suborbital = _alternative_frame(1, 'suborbital')
data_orbital = _alternative_frame(2, 'orbital')
data_moon_trip = _alternative_frame(3, 'moon_trip')
data_not_travel = _alternative_frame(4, 'not_travel')

# Merging the dataframes: the suborbital frame keeps all respondent
# attributes; the other frames contribute only their alternative-specific
# columns. The renames above make all non-key columns disjoint, so no merge
# suffixes are needed (the original code passed the invalid duplicate pair
# suffixes=('_suborbital', '_suborbital')).
merged_data = data_suborbital.merge(
    data_orbital[_alternative_columns('orbital')], on='id')
merged_data = merged_data.merge(
    data_moon_trip[_alternative_columns('moon_trip')], on='id')
merged_data = merged_data.merge(data_not_travel[['id', 'not_travel']], on='id')
merged_data.head()
# Creating the column with the choice: each alternative column holds a 0/1
# indicator, so the column name of the row-wise maximum is the chosen one.
alternatives = ['suborbital', 'orbital', 'moon_trip', 'not_travel']
merged_data['choice'] = merged_data[alternatives].idxmax(axis=1)
# Putting the choice in first place
merged_data.insert(0, 'choice', merged_data.pop('choice'))
# Changing year_birth for age (survey reference year: 2022)
merged_data['age'] = 2022 - merged_data['year_birth']
# Transforming age to generation: np.select picks the FIRST matching
# condition, so the thresholds act as upper bounds of each cohort.
_generation_cutoffs = [
    (1928, 'greater_generation'),
    (1946, 'traditionalist'),
    (1964, 'baby_boomers'),
    (1976, 'gen_x'),
    (1995, 'millenials'),
]
merged_data['generation_age'] = np.select(
    [merged_data['year_birth'] < cutoff for cutoff, _ in _generation_cutoffs],
    [label for _, label in _generation_cutoffs],
    default='centennials')
merged_data = merged_data.drop(columns=['year_birth'])
merged_data['generation_age'].value_counts().sort_index()
baby_boomers 216 centennials 312 gen_x 492 millenials 1140 Name: generation_age, dtype: int64
def city(zip):
    """Return the major city for a zipcode, or NaN when the lookup fails."""
    try:
        # reuse the shared engine — the original built a new SearchEngine
        # (a fresh database session) for every single row
        return engine.by_zipcode(zip).major_city
    except Exception:  # by_zipcode returns None for unknown zipcodes
        return np.nan


def state(zip):
    """Return the state abbreviation for a zipcode, or NaN when the lookup fails."""
    try:
        return engine.by_zipcode(zip).state
    except Exception:
        return np.nan


# Assigning the zipcode to the city (one shared engine for all lookups)
engine = SearchEngine()
merged_data['city'] = merged_data['zip'].apply(city)
merged_data['state'] = merged_data['zip'].apply(state)
# Aggregating states using https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
dict_states = {
    'west': [
        'AZ', 'CA', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'OR', 'WA', 'AK',
        'HI'
    ],
    'midwest':
    ['ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'MI', 'OH'],
    'south': [
        'TX', 'LA', 'MS', 'AL', 'AR', 'OK', 'FL', 'GA', 'KY', 'NC', 'SC', 'TN',
        'VA', 'WV', 'DC', 'MD', 'DE'
    ],
    'northeast': ['NY', 'PA', 'NJ', 'CT', 'RI', 'MA', 'VT', 'NH', 'ME']
}

# Inverted index built once (state abbreviation -> region) so each lookup is
# O(1) instead of scanning every region's list.
_state_to_region = {
    abbr: region for region, abbrs in dict_states.items() for abbr in abbrs
}


def argcontains(item):
    """Return the census region containing state abbreviation `item`, or NaN."""
    return _state_to_region.get(item, np.nan)
# Translate each state abbreviation into its census region (NaN if unknown).
merged_data['region'] = merged_data['state'].apply(argcontains)
# Dropping the unnecessary columns: merge keys, helper columns and the
# per-alternative choice indicators already folded into `choice`.
_drop_cols = [
    'id', 'alternative', 'list_alt', 'suborbital', 'orbital', 'moon_trip',
    'not_travel'
]
merged_data = merged_data.drop(columns=_drop_cols)
merged_data.head()
| choice | zip | gender | annual_income | household_annual_income | number_vehicles | level_education | work_type | children_home | household_type | status_in_household | type_residence | housing_tenure_type | origin | race | citizenship | risk_activities_sports | price_attribute_suborbital | availability_suborbital | probability_fatality_suborbital | training_suborbital | number_passengers_suborbital | takeoff_location_suborbital | price_dollars_suborbital | price_attribute_orbital | availability_orbital | probability_fatality_orbital | training_orbital | number_passengers_orbital | takeoff_location_orbital | price_dollars_orbital | price_attribute_moon_trip | availability_moon_trip | probability_fatality_moon_trip | training_moon_trip | number_passengers_moon_trip | takeoff_location_moon_trip | price_dollars_moon_trip | age | generation_age | city | state | region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | moon_trip | 55906 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | 7.5 | no | one | usa | 453125.0 | 362_perc_annual_income | immediate | 0.5 | yes | one | usa | 453125.0 | 3_perc_annual_income | immediate | 0.5 | no | more_than_one | usa | 3750.0 | 46 | millenials | Rochester | MN | midwest |
| 1 | suborbital | 55906 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | 0.5 | yes | one | usa | 453125.0 | 362_perc_annual_income | immediate | 0.5 | yes | more_than_one | other | 453125.0 | 362_perc_annual_income | in_5_years | 7.5 | no | one | usa | 453125.0 | 46 | millenials | Rochester | MN | midwest |
| 2 | moon_trip | 55906 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 50_perc_annual_income | in_5_years | 7.5 | yes | one | usa | 62500.0 | 362_perc_annual_income | in_5_years | 0.5 | no | one | other | 453125.0 | 50_perc_annual_income | immediate | 0.5 | no | more_than_one | other | 62500.0 | 46 | millenials | Rochester | MN | midwest |
| 3 | moon_trip | 55906 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 3_perc_annual_income | in_5_years | 7.5 | no | more_than_one | usa | 3750.0 | 50_perc_annual_income | immediate | 7.5 | no | one | other | 62500.0 | 50_perc_annual_income | in_5_years | 0.5 | yes | one | usa | 62500.0 | 46 | millenials | Rochester | MN | midwest |
| 4 | suborbital | 55906 | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 50_perc_annual_income | in_5_years | 0.5 | yes | more_than_one | usa | 62500.0 | 50_perc_annual_income | immediate | 0.5 | no | more_than_one | other | 62500.0 | 3_perc_annual_income | immediate | 7.5 | yes | more_than_one | usa | 3750.0 | 46 | millenials | Rochester | MN | midwest |
# Continue on a copy so merged_data stays available untouched.
data = merged_data.copy()
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2160 entries, 0 to 2159 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 choice 2160 non-null object 1 zip 2160 non-null int64 2 gender 2160 non-null object 3 annual_income 2160 non-null object 4 household_annual_income 2160 non-null object 5 number_vehicles 2160 non-null object 6 level_education 2160 non-null object 7 work_type 2160 non-null object 8 children_home 2160 non-null object 9 household_type 2160 non-null object 10 status_in_household 2160 non-null object 11 type_residence 2160 non-null object 12 housing_tenure_type 2160 non-null object 13 origin 2160 non-null object 14 race 2160 non-null object 15 citizenship 2160 non-null object 16 risk_activities_sports 2160 non-null object 17 price_attribute_suborbital 2160 non-null object 18 availability_suborbital 2160 non-null object 19 probability_fatality_suborbital 2160 non-null float64 20 training_suborbital 2160 non-null object 21 number_passengers_suborbital 2160 non-null object 22 takeoff_location_suborbital 2160 non-null object 23 price_dollars_suborbital 2160 non-null float64 24 price_attribute_orbital 2160 non-null object 25 availability_orbital 2160 non-null object 26 probability_fatality_orbital 2160 non-null float64 27 training_orbital 2160 non-null object 28 number_passengers_orbital 2160 non-null object 29 takeoff_location_orbital 2160 non-null object 30 price_dollars_orbital 2160 non-null float64 31 price_attribute_moon_trip 2160 non-null object 32 availability_moon_trip 2160 non-null object 33 probability_fatality_moon_trip 2160 non-null float64 34 training_moon_trip 2160 non-null object 35 number_passengers_moon_trip 2160 non-null object 36 takeoff_location_moon_trip 2160 non-null object 37 price_dollars_moon_trip 2160 non-null float64 38 age 2160 non-null int64 39 generation_age 2160 non-null object 40 city 2076 non-null object 41 state 2076 non-null object 42 region 2076 non-null object dtypes: 
float64(6), int64(2), object(35) memory usage: 742.5+ KB
# The following zipcodes do not have information about city and state
# (uszipcode could not resolve them, so their region stayed NaN).
data_without_region = data[data['region'].isnull()]
data_without_region['zip'].unique()
array([10429, 70028, 19632, 53427, 54321])
# Region shares including the unresolved (NaN) zipcodes.
data['region'].value_counts(normalize=True, dropna=False)
west 0.361111 south 0.333333 northeast 0.166667 midwest 0.100000 NaN 0.038889 Name: region, dtype: float64
The region distribution is not too imbalanced, and the unresolved zipcodes (NaN) account for only about 3.9% of the rows.
# Creating additional comparison features (suggested by Rodrigo): the mean of
# each attribute across the three travel alternatives, plus each alternative's
# deviation from that mean.
_TRAVEL_ALTS = ['orbital', 'suborbital', 'moon_trip']

## probability_fatality
data['average_probability_fatality'] = data[[
    f'probability_fatality_{alt}' for alt in _TRAVEL_ALTS
]].mean(axis=1)
for alt in _TRAVEL_ALTS:
    data[f'delta_probability_fatality_{alt}'] = (
        data['average_probability_fatality'] -
        data[f'probability_fatality_{alt}'])

## price_dollars
data['average_price_dollars'] = data[[
    f'price_dollars_{alt}' for alt in _TRAVEL_ALTS
]].mean(axis=1)
for alt in _TRAVEL_ALTS:
    data[f'delta_price_dollars_{alt}'] = (data['average_price_dollars'] -
                                          data[f'price_dollars_{alt}'])

# Dropping features (suggested by Rodrigo): the raw columns summarized above,
# and zip, which is already represented by city/state/region.
data = data.drop(columns=[
    'zip', 'probability_fatality_orbital', 'probability_fatality_suborbital',
    'probability_fatality_moon_trip', 'price_dollars_orbital',
    'price_dollars_suborbital', 'price_dollars_moon_trip'
])
print('Data shape: ', data.shape)
data.head()
Data shape: (2160, 44)
| choice | gender | annual_income | household_annual_income | number_vehicles | level_education | work_type | children_home | household_type | status_in_household | type_residence | housing_tenure_type | origin | race | citizenship | risk_activities_sports | price_attribute_suborbital | availability_suborbital | training_suborbital | number_passengers_suborbital | takeoff_location_suborbital | price_attribute_orbital | availability_orbital | training_orbital | number_passengers_orbital | takeoff_location_orbital | price_attribute_moon_trip | availability_moon_trip | training_moon_trip | number_passengers_moon_trip | takeoff_location_moon_trip | age | generation_age | city | state | region | average_probability_fatality | delta_probability_fatality_orbital | delta_probability_fatality_suborbital | delta_probability_fatality_moon_trip | average_price_dollars | delta_price_dollars_orbital | delta_price_dollars_suborbital | delta_price_dollars_moon_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | moon_trip | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | no | one | usa | 362_perc_annual_income | immediate | yes | one | usa | 3_perc_annual_income | immediate | no | more_than_one | usa | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | -4.666667 | 2.333333 | 303333.333333 | -149791.666667 | -149791.666667 | 299583.333333 |
| 1 | suborbital | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | yes | one | usa | 362_perc_annual_income | immediate | yes | more_than_one | other | 362_perc_annual_income | in_5_years | no | one | usa | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | 2.333333 | -4.666667 | 453125.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | moon_trip | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 50_perc_annual_income | in_5_years | yes | one | usa | 362_perc_annual_income | in_5_years | no | one | other | 50_perc_annual_income | immediate | no | more_than_one | other | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | -4.666667 | 2.333333 | 192708.333333 | -260416.666667 | 130208.333333 | 130208.333333 |
| 3 | moon_trip | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 3_perc_annual_income | in_5_years | no | more_than_one | usa | 50_perc_annual_income | immediate | no | one | other | 50_perc_annual_income | in_5_years | yes | one | usa | 46 | millenials | Rochester | MN | midwest | 5.166667 | -2.333333 | -2.333333 | 4.666667 | 42916.666667 | -19583.333333 | 39166.666667 | -19583.333333 |
| 4 | suborbital | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 50_perc_annual_income | in_5_years | yes | more_than_one | usa | 50_perc_annual_income | immediate | no | more_than_one | other | 3_perc_annual_income | immediate | yes | more_than_one | usa | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | 2.333333 | -4.666667 | 42916.666667 | -19583.333333 | -19583.333333 | 39166.666667 |
# Analyzing status_in_household per age
# Age distribution of respondents recorded as 'child' of the household head.
data[['status_in_household',
'age']][data['status_in_household'] == 'child'].describe()
| age | |
|---|---|
| count | 108.000000 |
| mean | 42.888889 |
| std | 16.422764 |
| min | 21.000000 |
| 25% | 28.000000 |
| 50% | 41.000000 |
| 75% | 57.000000 |
| max | 67.000000 |
# Analyzing status_in_household per age
# Mean age for each household status, shown as a bar chart.
data[['status_in_household',
'age']].groupby('status_in_household').mean().plot(kind='bar',
figsize=(20, 5),
fontsize=20)
<AxesSubplot:xlabel='status_in_household'>
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# Analyzing annual_income against household_annual_income
income = pd.crosstab(data['annual_income'], data['household_annual_income'])
a = income.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
# bbox_to_anchor requires an explicit corner; loc='best' is not honored when
# an anchor is given
a.legend(title='household_annual_income', bbox_to_anchor=(1, 1.02),
         loc='upper left', fontsize=20)
a.set_xlabel('annual_income', fontsize=20)
# the y-axis is a row count; household_annual_income lives in the legend
a.set_ylabel('count', fontsize=20)
Text(0, 0.5, 'household_annual_income')
# Analyzing household_type against children_home
children_home = pd.crosstab(data['household_type'], data['children_home'])
a = children_home.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
# explicit corner for the anchored legend (loc='best' is ignored here)
a.legend(title='children_home', bbox_to_anchor=(1, 1.02), loc='upper left',
         fontsize=20)
a.set_xlabel('household_type', fontsize=20)
# the y-axis is a row count; children_home categories are in the legend
a.set_ylabel('count', fontsize=20)
Text(0, 0.5, 'children_home')
data_filtered = data.copy()
print("Shape before filtering: ", data_filtered.shape)

# Removing rows where house_annual_income is lower than annual_income.
# The income labels carry 'a_'..'j_' prefixes in increasing order, so a plain
# lexicographic comparison reproduces the ordinal comparison that the
# original ~60-line chain of per-category drops spelled out by hand.
inconsistent_income = (data_filtered['household_annual_income'] <
                       data_filtered['annual_income'])

# Removing rows where household_type and children_home do not coincide:
# '0_*' household types declare no children, '1_*' types declare children,
# and '2_other' is unconstrained — exactly the cases the original drop
# chains enumerated one household type at a time.
has_children_home = data_filtered['children_home'] != '0_children'
no_children_type = data_filtered['household_type'].str.startswith('0_')
with_children_type = data_filtered['household_type'].str.startswith('1_')
inconsistent_children = ((no_children_type & has_children_home)
                         | (with_children_type & ~has_children_home))

# Keep only the rows that pass both consistency checks.
data_filtered = data_filtered[~(inconsistent_income | inconsistent_children)]

# status_in_household to be excluded in the model
print("Shape after filtering: ", data_filtered.shape)
Shape before filtering: (2160, 44) Shape after filtering: (1908, 44)
# Summary statistics of the numeric columns after filtering.
data_filtered.describe()
| age | average_probability_fatality | delta_probability_fatality_orbital | delta_probability_fatality_suborbital | delta_probability_fatality_moon_trip | average_price_dollars | delta_price_dollars_orbital | delta_price_dollars_suborbital | delta_price_dollars_moon_trip | |
|---|---|---|---|---|---|---|---|---|---|
| count | 1908.000000 | 1908.000000 | 1908.000000 | 1908.000000 | 1908.000000 | 1908.000000 | 1908.000000 | 1908.000000 | 1908.000000 |
| mean | 41.201258 | 4.011006 | -0.022013 | 0.011006 | 0.011006 | 65917.531447 | 306.132075 | -153.066038 | -153.066038 |
| std | 12.005651 | 1.782546 | 2.694918 | 3.391547 | 2.910430 | 75154.656241 | 71495.911836 | 80245.691536 | 83181.737163 |
| min | 21.000000 | 0.500000 | -4.666667 | -4.666667 | -4.666667 | 150.000000 | -520833.333333 | -599166.666667 | -599166.666667 |
| 25% | 31.000000 | 2.833333 | -2.333333 | -2.333333 | -2.333333 | 14591.666667 | -20970.833333 | -20970.833333 | -20970.833333 |
| 50% | 38.000000 | 5.166667 | 0.000000 | 0.000000 | 0.000000 | 42466.666667 | -391.666667 | 4700.000000 | 0.000000 |
| 75% | 51.000000 | 5.166667 | 2.333333 | 2.333333 | 2.333333 | 96354.166667 | 35950.000000 | 37612.500000 | 36458.333333 |
| max | 72.000000 | 7.500000 | 4.666667 | 4.666667 | 4.666667 | 906250.000000 | 419416.666667 | 419416.666667 | 599166.666667 |
# Analyzing annual_income against household_annual_income after filtering.
# Grouped bar chart of the cross-tabulated counts: x-axis = annual_income
# level, one bar colour per household_annual_income level (legend).
income_2 = pd.crosstab(data_filtered['annual_income'],
                       data_filtered['household_annual_income'])
a = income_2.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
# loc='best' is not honoured together with an explicit bbox_to_anchor;
# pin the legend corner instead of letting matplotlib fall back.
a.legend(bbox_to_anchor=(1, 1.02), loc='upper left', fontsize=20)
a.set_xlabel('annual_income', fontsize=20)
# Fix: the y-axis shows row counts — household_annual_income is the
# legend variable, not the y variable.
a.set_ylabel('count', fontsize=20)
Text(0, 0.5, 'household_annual_income')
# Analyzing household_type against children_home after filtering.
# Grouped bar chart of counts: x-axis = household_type, one bar colour
# per children_home level (legend).
children_home_2 = pd.crosstab(data_filtered['household_type'],
                              data_filtered['children_home'])
a = children_home_2.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
# Pin the legend corner; loc='best' is ignored with bbox_to_anchor set.
a.legend(bbox_to_anchor=(1, 1.02), loc='upper left', fontsize=20)
a.set_xlabel('household_type', fontsize=20)
# Fix: the y-axis is a count of rows; children_home is the legend variable.
a.set_ylabel('count', fontsize=20)
Text(0, 0.5, 'children_home')
# Build a pandas-profiling EDA report over the filtered dataset; the
# bare `profile` expression renders it inline in the notebook.
profile = ProfileReport(data_filtered, title='Report')
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
profile.to_file(base_dir + 'data/modelingData_SpaceTourism_strings_v4_Report.html')
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Persist the filtered dataset, then reload it so the modeling section
# below starts from the exact on-disk representation.
_csv_path = base_dir + 'data/modelingData_SpaceTourism_strings_v4.csv'
data_filtered.to_csv(_csv_path, index=False)
data = pd.read_csv(_csv_path)
data.head()
| choice | gender | annual_income | household_annual_income | number_vehicles | level_education | work_type | children_home | household_type | status_in_household | type_residence | housing_tenure_type | origin | race | citizenship | risk_activities_sports | price_attribute_suborbital | availability_suborbital | training_suborbital | number_passengers_suborbital | takeoff_location_suborbital | price_attribute_orbital | availability_orbital | training_orbital | number_passengers_orbital | takeoff_location_orbital | price_attribute_moon_trip | availability_moon_trip | training_moon_trip | number_passengers_moon_trip | takeoff_location_moon_trip | age | generation_age | city | state | region | average_probability_fatality | delta_probability_fatality_orbital | delta_probability_fatality_suborbital | delta_probability_fatality_moon_trip | average_price_dollars | delta_price_dollars_orbital | delta_price_dollars_suborbital | delta_price_dollars_moon_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | moon_trip | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | no | one | usa | 362_perc_annual_income | immediate | yes | one | usa | 3_perc_annual_income | immediate | no | more_than_one | usa | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | -4.666667 | 2.333333 | 303333.333333 | -149791.666667 | -149791.666667 | 299583.333333 |
| 1 | suborbital | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 362_perc_annual_income | immediate | yes | one | usa | 362_perc_annual_income | immediate | yes | more_than_one | other | 362_perc_annual_income | in_5_years | no | one | usa | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | 2.333333 | -4.666667 | 453125.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | moon_trip | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 50_perc_annual_income | in_5_years | yes | one | usa | 362_perc_annual_income | in_5_years | no | one | other | 50_perc_annual_income | immediate | no | more_than_one | other | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | -4.666667 | 2.333333 | 192708.333333 | -260416.666667 | 130208.333333 | 130208.333333 |
| 3 | moon_trip | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 3_perc_annual_income | in_5_years | no | more_than_one | usa | 50_perc_annual_income | immediate | no | one | other | 50_perc_annual_income | in_5_years | yes | one | usa | 46 | millenials | Rochester | MN | midwest | 5.166667 | -2.333333 | -2.333333 | 4.666667 | 42916.666667 | -19583.333333 | 39166.666667 | -19583.333333 |
| 4 | suborbital | male | h_100k_150k | i_150k_200k | 1_car | grad_prof_degree | private | 1_child | 1_couple_with_children | head | house | own | non_hispanic | asian | us_citizen | never | 50_perc_annual_income | in_5_years | yes | more_than_one | usa | 50_perc_annual_income | immediate | no | more_than_one | other | 3_perc_annual_income | immediate | yes | more_than_one | usa | 46 | millenials | Rochester | MN | midwest | 2.833333 | 2.333333 | 2.333333 | -4.666667 | 42916.666667 | -19583.333333 | -19583.333333 | 39166.666667 |
# Looking at the different levels of the ordinal categorical features.
# Iterate the column names directly instead of selecting a sub-frame
# only to read back its `.columns` (redundant indirection).
for col in [
        'number_vehicles', 'level_education', 'children_home',
        'risk_activities_sports', 'generation_age'
]:
    print(col, '\n', data[col].value_counts(normalize=True))
    print('---------------------------------\n')
number_vehicles 2_cars 0.522013 1_car 0.371069 3_cars 0.062893 4_or_more_cars 0.044025 Name: number_vehicles, dtype: float64 --------------------------------- level_education bachelor_degree 0.616352 grad_prof_degree 0.150943 some_college 0.125786 high_school_graduate 0.069182 associate_degree 0.037736 Name: level_education, dtype: float64 --------------------------------- children_home 0_children 0.377358 2_children 0.308176 1_child 0.264151 4_children 0.018868 3_children 0.018868 5_children_or_more 0.012579 Name: children_home, dtype: float64 --------------------------------- risk_activities_sports rarely 0.459119 never 0.358491 often 0.182390 Name: risk_activities_sports, dtype: float64 --------------------------------- generation_age millenials 0.547170 gen_x 0.245283 centennials 0.119497 baby_boomers 0.088050 Name: generation_age, dtype: float64 ---------------------------------
from pycaret.classification import *
# PyCaret classification experiment setup.
# - Target 'choice' is 4-class (moon_trip / not_travel / orbital /
#   suborbital, see the label-encoding row of the setup summary).
# - ignore_features drops location/derived columns from modeling.
# - ordinal_features supplies an explicit level order per feature so
#   the encoder preserves ordering instead of one-hot encoding.
exp1 = setup(
    data=data,
    target='choice',
    session_id=42,  # fixed RNG seed for reproducibility
    normalize=True,  # z-score scaling (default normalize_method)
    #normalize_method='minmax',
    #transformation=True,
    #pca = True,
    ignore_features=['city', 'state', 'generation_age', 'status_in_household'],
    ordinal_features={
        'annual_income': [
            'a_less_than_10k', 'b_10k_15k', 'c_15k_25k', 'd_25k_35k',
            'e_35k_50k', 'f_50k_75k', 'g_75k_100k', 'h_100k_150k',
            'i_150k_200k', 'j_more_than_200k'
        ],
        'household_annual_income': [
            'a_less_than_10k', 'b_10k_15k', 'c_15k_25k', 'd_25k_35k',
            'e_35k_50k', 'f_50k_75k', 'g_75k_100k', 'h_100k_150k',
            'i_150k_200k', 'j_more_than_200k'
        ],
        'number_vehicles': ['1_car', '2_cars', '3_cars', '4_or_more_cars'],
        'level_education': [
            # 'less_9th_grade', '9th_12th_grade_nodiploma', # Nobody in those ranges
            'high_school_graduate',
            'some_college',
            'associate_degree',
            'bachelor_degree',
            'grad_prof_degree'
        ],
        'children_home': [
            '0_children', '1_child', '2_children', '3_children', '4_children',
            '5_children_or_more'
        ],
        'risk_activities_sports': ['never', 'rarely', 'often'],
        # 'generation_age':
        # ['baby_boomers', 'gen_x', 'millenials', 'centennials']
    },
    train_size=0.8,
    use_gpu=True,
    combine_rare_levels=True,  # bucket levels under the 0.10 threshold
    remove_multicollinearity=True,  # drop features correlated above 0.9
    unknown_categorical_method='most_frequent',  # impute unseen levels
    remove_outliers=True,  # drop ~5% outliers (threshold 0.05)
    fix_imbalance=True,  # SMOTE oversampling of minority classes
    data_split_stratify=True,  # stratified train/test split
    fold_strategy='stratifiedkfold',
    silent=True,  # skip the interactive dtype-confirmation step
    log_experiment=True,
    experiment_name='first_exp',
)
| Description | Value | |
|---|---|---|
| 0 | session_id | 42 |
| 1 | Target | choice |
| 2 | Target Type | Multiclass |
| 3 | Label Encoded | moon_trip: 0, not_travel: 1, orbital: 2, suborbital: 3 |
| 4 | Original Data | (1908, 44) |
| 5 | Missing Values | True |
| 6 | Numeric Features | 9 |
| 7 | Categorical Features | 30 |
| 8 | Ordinal Features | True |
| 9 | High Cardinality Features | False |
| 10 | High Cardinality Method | None |
| 11 | Transformed Train Set | (1449, 62) |
| 12 | Transformed Test Set | (382, 62) |
| 13 | Shuffle Train-Test | True |
| 14 | Stratify Train-Test | True |
| 15 | Fold Generator | StratifiedKFold |
| 16 | Fold Number | 10 |
| 17 | CPU Jobs | -1 |
| 18 | Use GPU | True |
| 19 | Log Experiment | True |
| 20 | Experiment Name | first_exp |
| 21 | USI | 2565 |
| 22 | Imputation Type | simple |
| 23 | Iterative Imputation Iteration | None |
| 24 | Numeric Imputer | mean |
| 25 | Iterative Imputation Numeric Model | None |
| 26 | Categorical Imputer | constant |
| 27 | Iterative Imputation Categorical Model | None |
| 28 | Unknown Categoricals Handling | most_frequent |
| 29 | Normalize | True |
| 30 | Normalize Method | zscore |
| 31 | Transformation | False |
| 32 | Transformation Method | None |
| 33 | PCA | False |
| 34 | PCA Method | None |
| 35 | PCA Components | None |
| 36 | Ignore Low Variance | False |
| 37 | Combine Rare Levels | True |
| 38 | Rare Level Threshold | 0.100000 |
| 39 | Numeric Binning | False |
| 40 | Remove Outliers | True |
| 41 | Outliers Threshold | 0.050000 |
| 42 | Remove Multicollinearity | True |
| 43 | Multicollinearity Threshold | 0.900000 |
| 44 | Remove Perfect Collinearity | True |
| 45 | Clustering | False |
| 46 | Clustering Iteration | None |
| 47 | Polynomial Features | False |
| 48 | Polynomial Degree | None |
| 49 | Trignometry Features | False |
| 50 | Polynomial Threshold | None |
| 51 | Group Features | False |
| 52 | Feature Selection | False |
| 53 | Feature Selection Method | classic |
| 54 | Features Selection Threshold | None |
| 55 | Feature Interaction | False |
| 56 | Feature Ratio | False |
| 57 | Interaction Threshold | None |
| 58 | Fix Imbalance | True |
| 59 | Fix Imbalance Method | SMOTE |
best_models = compare_models(turbo=False, sort='auc', round=3)
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| gbc | Gradient Boosting Classifier | 0.540 | 0.788 | 0.539 | 0.541 | 0.538 | 0.383 | 0.384 | 1.861 |
| lightgbm | Light Gradient Boosting Machine | 0.539 | 0.787 | 0.537 | 0.541 | 0.538 | 0.382 | 0.383 | 16.177 |
| catboost | CatBoost Classifier | 0.524 | 0.777 | 0.523 | 0.525 | 0.522 | 0.364 | 0.365 | 8.267 |
| mlp | MLP Classifier | 0.508 | 0.753 | 0.507 | 0.513 | 0.508 | 0.342 | 0.343 | 54.289 |
| rbfsvm | SVM - Radial Kernel | 0.481 | 0.739 | 0.480 | 0.483 | 0.480 | 0.306 | 0.307 | 1.281 |
| ada | Ada Boost Classifier | 0.472 | 0.731 | 0.476 | 0.480 | 0.471 | 0.297 | 0.299 | 0.263 |
| lr | Logistic Regression | 0.483 | 0.729 | 0.484 | 0.485 | 0.481 | 0.310 | 0.311 | 13.012 |
| lda | Linear Discriminant Analysis | 0.475 | 0.729 | 0.476 | 0.477 | 0.474 | 0.300 | 0.301 | 0.275 |
| rf | Random Forest Classifier | 0.461 | 0.726 | 0.458 | 0.462 | 0.459 | 0.278 | 0.279 | 0.903 |
| et | Extra Trees Classifier | 0.433 | 0.701 | 0.429 | 0.432 | 0.431 | 0.241 | 0.241 | 1.431 |
| qda | Quadratic Discriminant Analysis | 0.319 | 0.681 | 0.285 | 0.268 | 0.189 | 0.049 | 0.116 | 0.338 |
| nb | Naive Bayes | 0.300 | 0.647 | 0.307 | 0.390 | 0.204 | 0.077 | 0.144 | 0.092 |
| knn | K Neighbors Classifier | 0.375 | 0.627 | 0.378 | 0.385 | 0.371 | 0.169 | 0.172 | 0.480 |
| dt | Decision Tree Classifier | 0.431 | 0.620 | 0.430 | 0.435 | 0.430 | 0.240 | 0.241 | 0.076 |
| gpc | Gaussian Process Classifier | 0.366 | 0.588 | 0.365 | 0.370 | 0.367 | 0.154 | 0.154 | 18.088 |
| dummy | Dummy Classifier | 0.258 | 0.500 | 0.250 | 0.067 | 0.106 | 0.000 | 0.000 | 0.113 |
| svm | SVM - Linear Kernel | 0.415 | 0.000 | 0.413 | 0.445 | 0.396 | 0.218 | 0.228 | 0.118 |
| ridge | Ridge Classifier | 0.481 | 0.000 | 0.482 | 0.482 | 0.478 | 0.308 | 0.309 | 0.167 |
# Best ML analysis
# Fit a LightGBM classifier with cross-validation; verbose=-1 silences
# LightGBM's per-fit log output.
lightgbm = create_model(estimator='lightgbm', fit_kwargs={'verbose': -1})
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.5724 | 0.8014 | 0.5724 | 0.5759 | 0.5739 | 0.4277 | 0.4278 |
| 1 | 0.5448 | 0.7721 | 0.5445 | 0.5446 | 0.5437 | 0.3896 | 0.3901 |
| 2 | 0.5724 | 0.8090 | 0.5707 | 0.5734 | 0.5702 | 0.4264 | 0.4281 |
| 3 | 0.4552 | 0.7338 | 0.4547 | 0.4466 | 0.4483 | 0.2702 | 0.2715 |
| 4 | 0.5379 | 0.7648 | 0.5344 | 0.5356 | 0.5354 | 0.3808 | 0.3815 |
| 5 | 0.5517 | 0.8131 | 0.5427 | 0.5466 | 0.5477 | 0.3990 | 0.3998 |
| 6 | 0.4966 | 0.7705 | 0.4905 | 0.5074 | 0.4996 | 0.3244 | 0.3255 |
| 7 | 0.6000 | 0.8264 | 0.5945 | 0.6092 | 0.6018 | 0.4628 | 0.4640 |
| 8 | 0.5241 | 0.7859 | 0.5291 | 0.5251 | 0.5217 | 0.3629 | 0.3645 |
| 9 | 0.5347 | 0.7884 | 0.5339 | 0.5433 | 0.5347 | 0.3786 | 0.3806 |
| Mean | 0.5390 | 0.7865 | 0.5367 | 0.5408 | 0.5377 | 0.3822 | 0.3833 |
| Std | 0.0390 | 0.0260 | 0.0385 | 0.0416 | 0.0404 | 0.0521 | 0.0520 |
# Per-class classification report for the LightGBM model, evaluated on
# the hold-out set (use_train_data=False).
plot_model(lightgbm, plot='class_report', use_train_data=False)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# Hold-out confusion matrix for LightGBM, normalised to percentages.
plot_model(estimator=lightgbm,
           plot='confusion_matrix',
           use_train_data=False,
           plot_kwargs={'percent': True})
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# Plotting the AUC
# ROC/AUC curves on the hold-out set.
plot_model(estimator=lightgbm, plot='auc', use_train_data=False)
# Prediction-error plot.
plot_model(estimator=lightgbm, plot='error')
# Model interpretation (default plot type; accepted types listed below).
interpret_model(estimator=lightgbm)
# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# tune hyperparameters to optimize AUC
# Search over 80 candidate configurations; choose_better=True keeps the
# base model if the tuned one does not improve CV AUC.
tuned_lightgbm = tune_model(lightgbm,
                            optimize='AUC',
                            n_iter=80,
                            fit_kwargs={'verbose': -1},  # silence LightGBM logs
                            early_stopping=True,
                            choose_better=True)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.5655 | 0.8056 | 0.5681 | 0.5801 | 0.5704 | 0.4191 | 0.4201 |
| 1 | 0.5379 | 0.7654 | 0.5356 | 0.5393 | 0.5382 | 0.3809 | 0.3811 |
| 2 | 0.5517 | 0.7944 | 0.5536 | 0.5514 | 0.5503 | 0.3995 | 0.4002 |
| 3 | 0.5034 | 0.7561 | 0.5037 | 0.5064 | 0.5033 | 0.3343 | 0.3351 |
| 4 | 0.5241 | 0.7708 | 0.5217 | 0.5200 | 0.5207 | 0.3626 | 0.3632 |
| 5 | 0.5655 | 0.8113 | 0.5545 | 0.5527 | 0.5565 | 0.4164 | 0.4179 |
| 6 | 0.5793 | 0.7983 | 0.5782 | 0.5825 | 0.5797 | 0.4350 | 0.4356 |
| 7 | 0.6000 | 0.8320 | 0.5966 | 0.6014 | 0.5994 | 0.4631 | 0.4638 |
| 8 | 0.5034 | 0.7711 | 0.5055 | 0.4978 | 0.4972 | 0.3365 | 0.3381 |
| 9 | 0.5694 | 0.7907 | 0.5705 | 0.5864 | 0.5713 | 0.4258 | 0.4292 |
| Mean | 0.5500 | 0.7896 | 0.5488 | 0.5518 | 0.5487 | 0.3973 | 0.3984 |
| Std | 0.0306 | 0.0224 | 0.0298 | 0.0340 | 0.0319 | 0.0407 | 0.0408 |
# Plotting the classification report
# All evaluations below use the hold-out set (use_train_data=False).
plot_model(estimator=tuned_lightgbm, plot='class_report', use_train_data=False)
# Confusion matrix, normalised to percentages.
plot_model(tuned_lightgbm,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
# Plotting the AUC
plot_model(estimator=tuned_lightgbm, plot='auc', use_train_data=False)
# Prediction-error plot.
plot_model(estimator=tuned_lightgbm, plot='error')
# Model interpretation (default plot type; accepted types listed below).
interpret_model(estimator=tuned_lightgbm)
# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.
# Best ML analysis
# Fit a Gradient Boosting Classifier with the experiment's CV settings.
gbc = create_model(estimator='gbc')
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.5862 | 0.8132 | 0.5880 | 0.5917 | 0.5882 | 0.4457 | 0.4461 |
| 1 | 0.5379 | 0.7643 | 0.5376 | 0.5392 | 0.5373 | 0.3808 | 0.3815 |
| 2 | 0.6069 | 0.8162 | 0.6090 | 0.6139 | 0.6097 | 0.4749 | 0.4753 |
| 3 | 0.5034 | 0.7625 | 0.4971 | 0.4924 | 0.4934 | 0.3329 | 0.3354 |
| 4 | 0.5172 | 0.7778 | 0.5174 | 0.5099 | 0.5119 | 0.3539 | 0.3547 |
| 5 | 0.5586 | 0.8120 | 0.5489 | 0.5500 | 0.5529 | 0.4076 | 0.4085 |
| 6 | 0.5034 | 0.7691 | 0.5021 | 0.5077 | 0.5040 | 0.3335 | 0.3343 |
| 7 | 0.6069 | 0.8227 | 0.6008 | 0.6145 | 0.6049 | 0.4709 | 0.4735 |
| 8 | 0.4966 | 0.7615 | 0.5027 | 0.4924 | 0.4929 | 0.3268 | 0.3275 |
| 9 | 0.4792 | 0.7779 | 0.4816 | 0.5014 | 0.4821 | 0.3048 | 0.3078 |
| Mean | 0.5396 | 0.7877 | 0.5385 | 0.5413 | 0.5377 | 0.3832 | 0.3844 |
| Std | 0.0450 | 0.0238 | 0.0441 | 0.0466 | 0.0462 | 0.0598 | 0.0596 |
# Plotting the classification report
# All evaluations below use the hold-out set (use_train_data=False).
plot_model(estimator=gbc, plot='class_report', use_train_data=False)
# Confusion matrix, normalised to percentages.
plot_model(gbc,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
# Plotting the AUC
plot_model(estimator=gbc, plot='auc', use_train_data=False)
# Prediction-error plot.
plot_model(estimator=gbc, plot='error')
# 'msa' interpretation plot (one of the accepted types listed below).
interpret_model(estimator=gbc, plot='msa')
# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.
# tune hyperparameters to optimize AUC
# 80-candidate search; choose_better=True falls back to the base model
# if tuning does not improve CV AUC.
tuned_gbc = tune_model(gbc,
                       optimize='AUC',
                       n_iter=80,
                       early_stopping=True,
                       choose_better=True)
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.5379 | 0.7934 | 0.5391 | 0.5409 | 0.5385 | 0.3815 | 0.3820 |
| 1 | 0.5517 | 0.7676 | 0.5501 | 0.5531 | 0.5501 | 0.3992 | 0.4006 |
| 2 | 0.5724 | 0.8057 | 0.5749 | 0.5690 | 0.5704 | 0.4281 | 0.4282 |
| 3 | 0.4759 | 0.7324 | 0.4777 | 0.4783 | 0.4748 | 0.3004 | 0.3015 |
| 4 | 0.5034 | 0.7603 | 0.5050 | 0.5054 | 0.5019 | 0.3368 | 0.3379 |
| 5 | 0.5931 | 0.8335 | 0.5849 | 0.5864 | 0.5891 | 0.4540 | 0.4544 |
| 6 | 0.5172 | 0.7782 | 0.5181 | 0.5309 | 0.5210 | 0.3525 | 0.3538 |
| 7 | 0.6000 | 0.8278 | 0.5914 | 0.5986 | 0.5973 | 0.4626 | 0.4637 |
| 8 | 0.5172 | 0.7814 | 0.5184 | 0.5207 | 0.5102 | 0.3535 | 0.3582 |
| 9 | 0.5486 | 0.7946 | 0.5494 | 0.5646 | 0.5521 | 0.3967 | 0.3984 |
| Mean | 0.5418 | 0.7875 | 0.5409 | 0.5448 | 0.5405 | 0.3865 | 0.3879 |
| Std | 0.0376 | 0.0291 | 0.0348 | 0.0353 | 0.0372 | 0.0495 | 0.0491 |
# Plotting the classification report
# All evaluations below use the hold-out set (use_train_data=False).
plot_model(estimator=tuned_gbc, plot='class_report', use_train_data=False)
# Confusion matrix, normalised to percentages.
plot_model(tuned_gbc,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
# Plotting the AUC
plot_model(estimator=tuned_gbc, plot='auc', use_train_data=False)
# Prediction-error plot.
plot_model(estimator=tuned_gbc, plot='error')
# 'msa' interpretation plot (one of the accepted types listed below).
interpret_model(estimator=tuned_gbc, plot='msa')
# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.
# finalize a model
# finalize_model refits each estimator on the full dataset
# (training + hold-out) using the preprocessing pipeline from setup().
final_lightgbm = finalize_model(lightgbm)
final_tuned_lightgbm = finalize_model(tuned_lightgbm)
final_gbc = finalize_model(gbc)
final_tuned_gbc = finalize_model(tuned_gbc)
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
# save a model
# save_model persists the preprocessing pipeline together with the
# fitted estimator as '<name>.pkl'.
save_model(final_lightgbm, base_dir + 'models/lightgbm_jupyter')
# NOTE(review): this path lacks the '_jupyter' suffix used by the other
# three models — confirm downstream loaders before renaming it.
save_model(final_tuned_lightgbm, base_dir + 'models/tuned_lightgbm')
save_model(final_gbc, base_dir + 'models/gbc_jupyter')
save_model(final_tuned_gbc, base_dir + 'models/tuned_gbc_jupyter')
Transformation Pipeline and Model Successfully Saved Transformation Pipeline and Model Successfully Saved Transformation Pipeline and Model Successfully Saved Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=None,
steps=[('dtypes',
DataTypes_Auto_infer(categorical_features=[],
display_types=False,
features_todrop=['city', 'state',
'generation_age',
'status_in_household'],
id_columns=[],
ml_usecase='classification',
numerical_features=[], target='choice',
time_features=[])),
('imputer',
Simple_Imputer(categorical_strategy='not_available',
fill_value_ca...
learning_rate=0.1, loss='deviance',
max_depth=3, max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_iter_no_change=None,
presort='deprecated',
random_state=42, subsample=1.0,
tol=0.0001, validation_fraction=0.1,
verbose=0, warm_start=False)]],
verbose=False),
'../models/tuned_gbc_jupyter.pkl')
tuned_lightgbm
LGBMClassifier(bagging_fraction=0.8, bagging_freq=0, boosting_type='gbdt',
class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
importance_type='split', learning_rate=0.05, max_depth=-1,
min_child_samples=31, min_child_weight=0.001, min_split_gain=0.3,
n_estimators=130, n_jobs=-1, num_leaves=80, objective=None,
random_state=42, reg_alpha=2, reg_lambda=0.4, silent='warn',
subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
# Independent sanity check: re-score each candidate with repeated
# stratified 10-fold CV on the preprocessed design matrix pulled from
# the PyCaret experiment via get_config().
X = get_config('X')
y = get_config('y')
cv1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
labels = ['gbc', 'tuned_gbc', 'lightgbm', 'tuned_lightgbm']
models = [gbc, tuned_gbc, lightgbm, tuned_lightgbm]
for label, model in zip(labels, models):
    score = cross_val_score(model, X, y, scoring='accuracy', cv=cv1, n_jobs=-1)
    # Single f-string instead of mixing '+' concatenation with
    # %-formatting; output text is unchanged.
    print(f'{label}_score: {np.mean(score) * 100:.2f}% +/-({np.std(score):.3f})')
gbc_score: 53.51% +/-(0.036) tuned_gbc_score: 53.51% +/-(0.036) lightgbm_score: 55.90% +/-(0.027) [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 tuned_lightgbm_score: 54.52% +/-(0.032)
# Evaluate every candidate model with repeated stratified CV, scoring by
# one-vs-rest weighted ROC AUC, and report mean% +/- std per model.
for label, model in zip(labels, models):
    scores = cross_val_score(
        model,
        X,
        y,
        scoring='roc_auc_ovr_weighted',
        cv=cv1,
        n_jobs=-1,  # use all cores; each fold fits independently
    )
    mean_pct = np.mean(scores * 100)
    spread = np.std(scores)
    print(label + '_score: %.2f%% +/-(%.3f)' % (mean_pct, spread))
gbc_score: 78.58% +/-(0.021) tuned_gbc_score: 78.58% +/-(0.021) lightgbm_score: 79.94% +/-(0.019) [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 tuned_lightgbm_score: 79.22% +/-(0.023)
# Same model comparison as above, but with the one-vs-one weighted ROC AUC
# scorer, which averages over all class pairs instead of one-vs-rest.
for label, model in zip(labels, models):
    scores = cross_val_score(
        model,
        X,
        y,
        scoring='roc_auc_ovo_weighted',
        cv=cv1,
        n_jobs=-1,  # parallelize folds across all available cores
    )
    mean_pct = np.mean(scores * 100)
    spread = np.std(scores)
    print(label + '_score: %.2f%% +/-(%.3f)' % (mean_pct, spread))
gbc_score: 78.77% +/-(0.021) tuned_gbc_score: 78.77% +/-(0.021) lightgbm_score: 80.12% +/-(0.019) [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 tuned_lightgbm_score: 79.40% +/-(0.022)
# Final comparison pass: class-frequency-weighted F1 under the same CV splits.
for label, model in zip(labels, models):
    scores = cross_val_score(
        model, X, y, scoring='f1_weighted', cv=cv1, n_jobs=-1
    )
    mean_pct = np.mean(scores * 100)
    spread = np.std(scores)
    print(label + '_score: %.2f%% +/-(%.3f)' % (mean_pct, spread))
gbc_score: 53.33% +/-(0.035) tuned_gbc_score: 53.33% +/-(0.035) lightgbm_score: 55.84% +/-(0.026) [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.8[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. 
Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 [LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8 [LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8 [LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0 tuned_lightgbm_score: 54.21% +/-(0.030)
!jupyter nbconvert --to html eda_v2.ipynb
[NbConvertApp] Converting notebook eda_v2.ipynb to html [NbConvertApp] WARNING | Non-unique cell id 'e353102c' detected. Corrected to '05f8e080'. [NbConvertApp] WARNING | Non-unique cell id 'f6925884-c986-4e6c-9d3b-a09c6f62d269' detected. Corrected to '25cb6701'. [NbConvertApp] WARNING | Non-unique cell id 'b83a91f0-0294-4853-af22-773685e5626f' detected. Corrected to 'f44bc7ae'. [NbConvertApp] WARNING | Non-unique cell id '574e711d-4494-4e3b-9861-bf2c506425d2' detected. Corrected to 'b0aaf74c'. [NbConvertApp] WARNING | Non-unique cell id '9d2acf4f-45a6-43cb-b0bd-c29fe587010a' detected. Corrected to 'dc31989b'. [NbConvertApp] WARNING | Non-unique cell id '0080127b-75f3-4635-98b5-d6be1d9a8347' detected. Corrected to '4de6558a'. [NbConvertApp] WARNING | Non-unique cell id 'c995e0c7' detected. Corrected to '8055c5ff'. [NbConvertApp] WARNING | Non-unique cell id 'c22185cd' detected. Corrected to '986ab5e6'. [NbConvertApp] WARNING | Non-unique cell id 'f6925884-c986-4e6c-9d3b-a09c6f62d269' detected. Corrected to '7c5eaa53'. [NbConvertApp] WARNING | Non-unique cell id '677894bb-d02e-44cf-804f-7e73e1bb7005' detected. Corrected to '417cacf0'. [NbConvertApp] WARNING | Non-unique cell id 'b83a91f0-0294-4853-af22-773685e5626f' detected. Corrected to '5d3ab0bc'. [NbConvertApp] WARNING | Non-unique cell id '574e711d-4494-4e3b-9861-bf2c506425d2' detected. Corrected to '6c10aab2'. [NbConvertApp] WARNING | Non-unique cell id '9d2acf4f-45a6-43cb-b0bd-c29fe587010a' detected. Corrected to '6a397fb5'. [NbConvertApp] WARNING | Non-unique cell id '0080127b-75f3-4635-98b5-d6be1d9a8347' detected. Corrected to 'eec614c9'. [NbConvertApp] WARNING | Non-unique cell id 'c995e0c7' detected. Corrected to 'bc0d9f55'. [NbConvertApp] WARNING | Non-unique cell id 'f6925884-c986-4e6c-9d3b-a09c6f62d269' detected. Corrected to '7e9e9a3a'. [NbConvertApp] WARNING | Non-unique cell id '16733958' detected. Corrected to 'b1e88a30'. 
[NbConvertApp] WARNING | Non-unique cell id 'b83a91f0-0294-4853-af22-773685e5626f' detected. Corrected to '948c9952'. [NbConvertApp] WARNING | Non-unique cell id '574e711d-4494-4e3b-9861-bf2c506425d2' detected. Corrected to 'aa403d74'. [NbConvertApp] WARNING | Non-unique cell id '9d2acf4f-45a6-43cb-b0bd-c29fe587010a' detected. Corrected to '24a87f42'. [NbConvertApp] WARNING | Non-unique cell id '0080127b-75f3-4635-98b5-d6be1d9a8347' detected. Corrected to '323f82b0'. [NbConvertApp] WARNING | Non-unique cell id 'c995e0c7' detected. Corrected to '98550f58'. [NbConvertApp] WARNING | Non-unique cell id 'caa89228' detected. Corrected to '152daf8e'. [NbConvertApp] Writing 5485524 bytes to eda_v2.html